*** Start from a clean session: empty memory, no open log, no user-defined programs
clear all
capture log close
program drop _all
*** Do not pause the output for --more-- prompts
set more off

*** Log this run (an existing log file of the same name is overwritten)
log using "..\Programs/Preparing Global Migration Numbers.log", replace

*** Preparing Global Migration Numbers.do
*** 7/25/2016
*** Builds a world population series and a global migrant series from the input workbook and wbopendata,
*** then saves migrants as a % of world population in "Global Migration - Percent of Population.dta"

*************************************************************
*************************************************************
*** Global population 1850-1950 - from UN Dept. of Economic and Social Affairs 
*************************************************************
*************************************************************

*************************************
*** Load the UN historical population sheet
*************************************
import excel using "..\Input Data\Globalization Input Data.xlsx", sheet("UN Historical Population") firstrow clear

*** Only the 1850, 1900 and 1950 benchmarks are used; show them in the log, then discard everything else
list if inlist(year, 1850, 1900, 1950)
keep if inlist(year, 1850, 1900, 1950)

*************************************
*** Build an annual 1846-1950 frame in which only the benchmark years carry values
*************************************

*** Insert one empty row for every year between the benchmarks
tsset year
tsfill, full

*** Clone the first row (1850) four times and walk the clones back to 1849, 1848, 1847 and 1846
expand 5 if _n == 1, generate(is_clone)
gsort -is_clone year
replace year = year - _n if is_clone == 1

*** The clones still carry the 1850 figures; blank them out so they are estimated rather than copied
foreach series of varlist Africa-Oceania World {
	replace `series' = . if is_clone == 1
}
drop is_clone

*** Fill the non-benchmark years of every region by interpolating the LOG of population linearly in year
*** (a constant growth rate between benchmarks); the epolate option extends the same line to the years
*** before the first benchmark.
*** The intermediate logs live in temporary variables, so no other variable whose name happens to start
*** with "ln" can be dropped by accident and long region names cannot run into the 32-character name limit.
foreach var of varlist Africa-Oceania {

	tempvar ln_known ln_filled

	*** Get log of population for this region (missing wherever the population is missing)
	gen `ln_known' = log(`var')
	drop `var'

	*** Interpolate (and extrapolate) log of population over year
	ipolate `ln_known' year, gen(`ln_filled') epolate

	*** Exponentiate interpolated log population values; the region variable is recreated at the end of the variable list
	gen `var' = exp(`ln_filled')
	drop `ln_known' `ln_filled'
}

*************************************
*** Rebuild the world total from the regional series
*************************************
*** The imported World column was not interpolated, so replace it with the row sum of the regions
drop World
egen World = rowtotal(Africa-Oceania)
order World, after(year)

*** Benchmark years, as a visual check in the log
list if inlist(year, 1850, 1900, 1950)

*************************************
*** Convert from millions of people to people
*************************************
foreach series of varlist World-Oceania {
	replace `series' = 1000000 * `series'
}

*** Store 1846-1949 in a temporary file (1950 is left out of this file)
keep if year != 1950
tempfile globalpop_1850_1949
save `globalpop_1850_1949'.dta, replace

*************************************************************
*************************************************************
*** Global population 1950-2015 - from UN Dept. of Economic and Social Affairs 
*************************************************************
*************************************************************

*************************************
*** Load the UN 1950-2015 population sheet
*************************************
import excel using "..\Input Data\Globalization Input Data.xlsx", sheet("UN Population 1950-2015") firstrow clear

*** Park it in a temporary file
tempfile globalpop_1950_2015
save `globalpop_1950_2015'.dta, replace

*************************************************************
*************************************************************
*** Stack the two population files: the estimated years first, then 1950-2015
*************************************************************
*************************************************************
clear all
append using `globalpop_1850_1949'.dta `globalpop_1950_2015'.dta

*** Optional plots: population by region and for the world
*twoway line Africa-Oceania year
*twoway line World year

*************************************
*** Keep only the world total by year and store it for the final merge
*************************************
keep year World
tempfile global_pop
save `global_pop'.dta, replace

*************************************************************
*************************************************************
*** Global migration 1850-1940 -- McKeown 2004
*************************************************************
*************************************************************

*************************************
*** Load McKeown's five-year migrant flow totals
*************************************
import excel using "..\Input Data\Globalization Input Data.xlsx", sheet("McKeown Global Migrant Flows") firstrow clear
rename TransAtlantic mig_flow_transatlantic
rename SoutheastAsia mig_flow_seasia
rename NorthAsia mig_flow_nasia

*** Label each row with the first year of its range:
*** once sorted on the range, row 1 starts in 1846, row 2 in 1851, ... row 19 in 1936
*** (any row past the 19th is left without a year)
tab YearRange, m
sort YearRange
generate year = 1846 + 5 * (_n - 1) if _n <= 19
list YearRange year
drop YearRange

*************************************
*** Calculate global total
*************************************
*** Row sum of the regional flows (rowtotal counts a missing region as zero)
egen global_migration = rowtotal(mig_flow*)
label variable global_migration "Global Migration"

*************************************
*** Spread out the migrant flows over 5 years (since we have 5 year totals)
*************************************
*** Each five-year total becomes five equal annual flows
foreach var of varlist mig_flow_* global_migration {
	replace `var' = `var'/5
}

*** Make five copies of every period row, number them 1-5 within the period,
*** and turn the copies into the five calendar years that start at the period's first year
expand (5)
sort year
by year: egen new_num = seq()
rename year year_orig
gen year = year_orig + new_num - 1
drop year_orig new_num

*************************************
*** Prepare and merge in migrant return rates
*************************************

*** Step 1: McKeown's Southeast Asian migration numbers, split into China and India
**** (They are the weights used when the Chinese and Indian return rates are combined
**** into a single return rate for all of Southeast Asia)
preserve
import excel using "..\Input Data\Globalization Input Data.xlsx", sheet("McKeown Migrant Flows CHN IND") firstrow clear
rename India mig_flow_india
rename China mig_flow_china

*** Tag each row with the first year of its range (sorted row k starts in 1846 + 5*(k-1), through 1936)
tab YearRange, m
sort YearRange
generate year = 1846 + 5 * (_n - 1) if _n <= 19

*** Turn the five-year totals into five equal annual flows
foreach flow of varlist mig_flow_* {
	replace `flow' = `flow'/5
}
expand (5)
rename year year_orig
bysort year_orig: generate new_num = _n
generate year = year_orig + new_num - 1

*** Only the calendar year and the two flows are stored
keep year mig_flow_*
tempfile chn_ind
save `chn_ind'.dta, replace
restore

*** Read in the return rates
*** (done inside preserve/restore so the annual flow data in memory is untouched)
preserve
import excel using "..\Input Data\Globalization Input Data.xlsx", sheet("McKeown Migrant Return Rates") clear firstrow

rename US return_rate_transatlantic
rename Chinese return_rate_china
rename Indian return_rate_india

*** Calculate a single return rate for all of Southeast Asia by weighting the Chinese and Indian return rates by the Chinese and Indian shares of Southeast Asian migrant flows for each year
*** (Note that McKeown's migrant flow numbers for southeast Asia are simply the sum of his numbers for China and India)
*** assert(2 3) stops the script if a return-rate year has no China/India flows; keep(3) drops flow years without return rates
merge 1:1 year using `chn_ind'.dta, assert(2 3) keep(3)
 
*** Flow-weighted average of the two rates; it is missing whenever either rate is missing, which the two replaces below deal with
gen return_rate_seasia = mig_flow_china/(mig_flow_china + mig_flow_india) * return_rate_china ///
	+ mig_flow_india/(mig_flow_china + mig_flow_india) * return_rate_india

*** Use the Indian return rate for all Southeast Asia if the Chinese return rate is missing, and vice versa
replace return_rate_seasia = return_rate_china if return_rate_india == .
replace return_rate_seasia = return_rate_india if return_rate_china == .

*** Store only the two rates the main dataset needs (this also discards _merge and the China/India columns)
keep year return_rate_transatlantic return_rate_seasia
tempfile return_rates
save `return_rates'.dta, replace
restore

*** Attach the rates to the annual flows; assert(1 3) stops the script if the rate file contains a year the flow data does not have
merge 1:1 year using `return_rates'.dta, assert(1 3) nogen 

*** We are missing return rates for 1850-1870, so we will extend the average of the return rates from 1870-1875 backwards
foreach region in "seasia" "transatlantic" {
	*** Mean of the rate over the rows with year 1870 through 1875 (defined on those rows only)
	egen mean_1870_1875_temp = mean(return_rate_`region') if year >=1870 & year <= 1875
	*** Copy that single value onto every row
	egen mean_1870_1875 = max(mean_1870_1875_temp)
	*** Guard: no row before 1870 may already hold a rate, otherwise the replace would overwrite real data
	assert return_rate_`region' == . if year < 1870
	replace return_rate_`region' = mean_1870_1875 if year < 1870
	*** Removes both helper variables
	drop mean*
}

*** Calculate the return rate for north Asia as the weighted average of the transatlantic return rate and the southeast Asian return rate
*** (the weights are the transatlantic and southeast Asian annual flows of the same year)
gen return_rate_nasia = mig_flow_transatlantic/(mig_flow_transatlantic + mig_flow_seasia) * return_rate_transatlantic ///
	+ mig_flow_seasia/(mig_flow_transatlantic + mig_flow_seasia) * return_rate_seasia

*** Drop 1939 and 1940 because we are missing return rate data
*** (the assert confirms that all three rates really are missing in those two years before they are dropped)
assert return_rate_transatlantic == . & return_rate_nasia == . & return_rate_seasia == . if inlist(year, 1939, 1940)
drop if inlist(year, 1939, 1940)

*************************************
**** Read in median US immigrant age - Historical Statistics of the United States
*************************************
*** (done inside preserve/restore; the result is merged onto the main data by year)
preserve
import excel using "..\Input Data\Globalization Input Data.xlsx", clear sheet("US Migrant Age Distribution") firstrow

*** Calculate percentages in each category
*** (the varlist youthful-older relies on the column order of the sheet)
foreach var of varlist youthful-older {
	gen `var'_pct = `var'/(youthful + midage + older)
}
*** The calculation below places the median inside the middle age category,
*** which is only valid if fewer than half of the migrants are in the youthful category
assert youthful_pct < .5

*** Calculate the % of the middle age category where median will fall
gen mid_pct = (.5-youthful_pct)/midage_pct

*** Get the median age
*** (linear interpolation inside the middle category, which is treated as running from youth_bound + 1 through mid_bound)
*** NOTE(review): this reads as if youth_bound and mid_bound are the highest ages of the youthful and middle categories - confirm against the input sheet
gen median_age = (youth_bound + 1) + mid_pct * (mid_bound - (youth_bound + 1) + 1)

keep year median_age
tempfile med_age
save `med_age'.dta, replace
restore
*** keep(1 3): years of the main data without a median age are kept (median_age missing), extra years in the age file are dropped
merge 1:1 year using `med_age'.dta, keep(1 3) nogen

*** Round median age for merging
*** (the life expectancy table merged further down is keyed on year and whole-number age)
replace median_age = round(median_age)

*************************************
*** Prepare and merge in the life expectancy of migrants
*************************************

*** Step 1: male share of migrants (from Historical Statistics of the United States),
*** used further down to blend male and female life expectancy
preserve
import excel using "..\Input Data\Globalization Input Data.xlsx", sheet("US Migrant Gender Ratio") firstrow clear
*** Turn the ratio m_to_100f (presumably males per 100 females) into the male share of all migrants
generate pct_male_orig = m_to_100f/(m_to_100f + 100)
*** Use data through 1950 only and fill the years in between linearly
keep if year <= 1950
tsset year
tsfill, full
ipolate pct_male_orig year, generate(pct_male)
keep year pct_male
tempfile pct_male
save `pct_male'.dta, replace
restore

*** Step 2: global life expectancy at birth (from Riley),
*** used further down to scale the US life expectancy numbers to global levels
preserve
import excel using "..\Input Data\Globalization Input Data.xlsx", sheet("Riley Global Life Exp at Birth") firstrow clear
*** Fill the years between data points linearly
rename global_expectancy global_expectancy_orig
tsset year
tsfill, full
ipolate global_expectancy_orig year, generate(global_expectancy)
keep year global_expectancy
tempfile glob_exp
save `glob_exp'.dta, replace
restore

*** Step 3: US life expectancy tables - Historical Statistics of the United States
preserve
import excel using "..\Input Data\Globalization Input Data.xlsx", sheet("US Life Expectancy") firstrow clear

*** Every input row covers year_start through year_end;
*** replace it by one row per calendar year of that range
generate long span_id = _n
generate span_len = year_end - year_start + 1
quietly expand span_len
bysort span_id: generate year = year_start + _n - 1
drop span_id span_len

*** Where several input rows cover the same year,
**** keep the row whose range is the shortest
sort year
duplicates tag year, generate(dup)
list if dup, ab(20) sepby(year)
generate time_range = year_end - year_start if dup > 0
bysort year: egen min_time_range = min(time_range) if dup > 0
keep if !(dup > 0 & time_range != min_time_range)
drop dup *time_range

*** Exactly one row per year must be left
isid year
drop year_*

*** Calculate average life expectancy for men and women combined, based on the % of migrants that were male
merge 1:1 year using `pct_male'.dta, keep(1 3) nogen norep
tab year if pct_male == .
*** Use 1870 values for the 1850s because we have no data for the 1850s
*** (each replace copies the following row's value into a missing row, so the two passes fill up to
*** two consecutive missing rows; the assert stops the script if anything is still missing)
sort year
replace pct_male = pct_male[_n+1] if pct_male == .
replace pct_male = pct_male[_n+1] if pct_male == .
assert pct_male < .
*** Male-share-weighted average of male and female life expectancy at ages 0, 20, 40 and 60
foreach age in 0 20 40 60 {
	gen age`age' = pct_male * age`age'_m + (1-pct_male) * age`age'_f
	drop age`age'_m age`age'_f
}
drop pct_male

*** Interpolate life expectancy for every year 
tsset year 
tsfill, full 
foreach age in 0 20 40 60 { 
	rename age`age' age`age'_orig
	*** The source variable is named in full: after the rename the short name only resolved
	*** through variable abbreviation, which fails when varabbrev is switched off
	ipolate age`age'_orig year, gen(age`age')
	drop age`age'_orig
}
*** Use 1850 values for pre-1850s 
*** (four copies of the 1850 row become 1849, 1848, 1847 and 1846)
expand (5) if year == 1850, gen(new)
egen new_num = seq() if new == 1
replace year = year - new_num if new == 1
drop new*

*** Adjust US life expectancy at various ages downward based on the ratio of global life expectancy at birth to US life expectancy at birth
*** (assert(2 3) requires a global figure for every year held here; keep(3) drops years that only the global file has)
merge 1:1 year using `glob_exp'.dta, assert(2 3) keep(3) nogen norep
*** ratio is stored before the loop, so rescaling age0 inside the loop does not change the factor applied to the other ages
gen ratio = global_expectancy/age0
foreach var of varlist age* {
	replace `var' = `var' * ratio
}
drop global_expectancy ratio

**** Interpolate life expectancy for every age from 0 to 60 
rename age* expectancy*
*** Long form: one row per year and age (the numeric suffix of each expectancy variable becomes age)
reshape long expectancy, i(year) j(age)
*** Wide by year: one row per age, one expectancy column per year
reshape wide expectancy, i(age) j(year)
*** Declaring age as the time variable lets tsfill add a row for every age between the ones we have
tsset age
tsfill, full
*** Fill the added ages linearly, one year column at a time
foreach var of varlist expectancy* {
	rename `var' `var'_orig
	ipolate `var'_orig age, gen(`var')
	drop `var'_orig
}
*** Back to one row per age and year
reshape long expectancy, i(age) j(year)

*** Save life expectancy tempfile
order year age expectancy
keep  year age expectancy
tempfile life_expectancy
save `life_expectancy'.dta, replace
restore

*** Merge in life expectancy
*** (matched on year and the rounded median migrant age; assert(2 3) stops the script if any row of the main data finds no match)
rename median_age age
merge 1:1 year age using `life_expectancy'.dta, assert(2 3) keep(3) nogen 


*************************************
*** Get the  # of immigrants from each year remaining in subsequent years,
*** assuming that the average immigrant will live for "expectancy" # of years 
************************************

*** For every arrival year and region the loop creates one variable named remaining_ + region + arrival year:
***   missing in years up to and including the arrival year,
***   the arrival year's flow in later years up to arrival year + expectancy,
***   0 in years after that
levelsof year, local(years)
foreach yr of local years {
	foreach region in "transatlantic" "nasia" "seasia" { 
		*** Copy the arrival year's flow and life expectancy onto every row
		foreach var of varlist mig_flow_`region' expectancy {
			quietly gen `var'`yr'_temp = `var' if year == `yr'
			quietly egen `var'`yr' = max(`var'`yr'_temp)
		}
		quietly gen remaining_`region'`yr' = mig_flow_`region'`yr' if year > `yr'
		quietly replace remaining_`region'`yr' = 0 if year > `yr' + expectancy`yr'
		label variable remaining_`region'`yr' "# of People who Immigrated in `yr' Remaining"
		*** Remove the helper copies for this arrival year (the annual mig_flow_ and expectancy variables themselves stay)
		drop mig_flow*`yr'* expectancy`yr'*
	}
}

*************************************
*** Calculate the # of migrants who return so that the return rate is distributed evenly throughout the entire migrant stock,
**** and excluding the so-called returnees who are actually children of original migrants
*************************************

*** One row per year and arrival year; the numeric suffix of the remaining_ variables becomes year_migrated
reshape long remaining_transatlantic remaining_nasia remaining_seasia, i(year return_rate_transatlantic return_rate_seasia return_rate_nasia expectancy) j(year_migrated)
*** Drop rows where all three regions are missing (the year is not after the arrival year)
*** or all three are zero (the year is past arrival year + expectancy)
drop if remaining_transatlantic == . &  remaining_nasia == . & remaining_seasia == . ///
	| remaining_transatlantic == 0 &  remaining_nasia == 0 & remaining_seasia == 0
*** In-memory snapshot of this stage; only the commented-out debugging lines inside the return loop refer to it
snapshot save

*** Merge in population growth rates so that we can calculate the growth in the population of migrants who arrived each year
*** First merge: world population of the current year
merge m:1 year using `globalpop_1850_1949'.dta, assert(2 3) keep(3) nogen keepusing(World)
rename World pop_current_year
*** Second merge: world population of the arrival year. The population file is keyed on year,
*** so year_migrated temporarily takes the name year and the names are swapped back afterwards
rename year year_temp
rename year_migrated year
merge m:1 year using `globalpop_1850_1949'.dta,  assert(2 3) keep(3) nogen keepusing(World)
rename year year_migrated
rename year_temp year
rename World pop_year_migrated

*** Get growth rate between the year the migrants migrated and the current year
gen pop_growth = (pop_current_year - pop_year_migrated)/pop_year_migrated

*** Work through the years in ascending order. Returnees of a year are removed from all LATER years of
*** their cohort, so the stock computed for a later year already reflects the returns of earlier years.
foreach yr of local years {

	foreach region in "transatlantic" "nasia" "seasia" { 
		 /*snapshot restore 1
		 local yr = 1850
		 local region = "nasia"*/
		 
		*** Calculate the preliminary total migrant stock for this region
		*** (sum over arrival cohorts of what remains in the current loop year; missing on the rows of all other years)
		quietly bys year: egen mig_stock_`region' = total(remaining_`region') if year == `yr'
		
		*** Calculate what % of the migrants from this region in year `yr' migrated in each year
		quietly gen mig_pct = remaining_`region'/mig_stock_`region' if year == `yr'
		
		*** Calculate the # of migrants that migrated in each (year_migrated) who returned in year `yr'
		*** such that migrants have an equal chance of returning, regardless of when they migrated
		*** and discounted to account for the growth of the migrant population (because many of the returning "migrants" will actually be the children of migrants")
		*** NOTE(review): mig_pct is already remaining divided by the stock, so this expression is proportional to
		*** remaining squared over the stock rather than to remaining - confirm that this weighting is intended
		quietly gen num_returned_temp = return_rate_`region'/100 * 1/(1+pop_growth) * mig_pct * remaining_`region' if year == `yr'
		*** Copy each cohort's number of returnees onto all rows of that cohort
		quietly bys year_migrated: egen num_returned = max(num_returned_temp)
		
		*** Subtract out the # of migrants that returned
		*** (only in later years, only where a number of returnees exists, and never below zero)
		quietly replace remaining_`region' = remaining_`region' - num_returned if year > `yr' & num_returned < .
		quietly replace remaining_`region' = 0 if remaining_`region' < 0
		
		drop mig_stock_`region' mig_pct num_returned*
	}
}

*** Back to one row per year: the remaining stock of each arrival cohort becomes its own column again
keep year year_migrated mig_flow* remaining*
reshape wide remaining_transatlantic remaining_nasia remaining_seasia, i(year mig_flow*) j(year_migrated)

*************************************
*** Add up total global migration
*************************************

*** The flows of the year plus the remaining stock of every earlier cohort
*** (the total is missing only if every component is missing)
egen total_global_mig_orig = rowtotal(mig_flow* remaining*), missing

*** Smooth the total with a three-year moving average (previous, current and next year)
tsset year
tssmooth ma total_global_migration = total_global_mig_orig, window(1 1 1)
*twoway line total_global_migration total_global_mig_orig year

*** Store the smoothed series under the name global_migration
keep year total_global_migration
rename total_global_migration global_migration
tempfile globalmig_1850_1940
save `globalmig_1850_1940'.dta, replace

*************************************************************
*************************************************************
*** Global Migration 1960-2015
*************************************************************
*************************************************************

*************************************
*** Download World Bank indicator SM.POP.TOTL through wbopendata
*************************************
wbopendata, indicator(SM.POP.TOTL) clear

*** Only the row of the "World" aggregate is needed
drop if countryname != "World"

*** Turn the yearly columns into one row per year
rename yr* global_migration*
keep countryname countrycode global_migration*
reshape long global_migration, i(countryname countrycode) j(year)

*** Store for the final append
tempfile globalmig_1960_2015
save `globalmig_1960_2015'.dta, replace

*************************************************************
*************************************************************
*** Stack the historical and the World Bank migration series
*************************************************************
*************************************************************
clear all
append using `globalmig_1850_1940'.dta `globalmig_1960_2015'.dta

*************************************************************
*************************************************************
*** Calculate global migration as a % of total population
*************************************************************
*************************************************************

*************************************
*** Attach world population; only years present in both files survive
*************************************
merge 1:1 year using `global_pop'.dta, keep(3) nogen

*** Years without a migration figure are of no use
drop if global_migration >= .

*************************************
*** Migrants as a percentage of world population
*************************************
generate migrants_pct = global_migration/World * 100
sort year
/*twoway (line migrants_pct year if year >= 1870 & year <= 1940) (line migrants_pct year if year >= 1960 & year <= 2015, lcolor(navy)), ytitle("Global Migrants (% World Population)") ///
	xtitle("Year") xlabel(1870(10)2015, angle(45)) xtick(1875(10)2010) legend(order(1))*/

*************************************
*** Save year, migrant count and migrant share, in that order
*************************************
keep year global_migration migrants_pct
order year global_migration migrants_pct
desc, f
save "Global Migration - Percent of Population.dta", replace

log close